#ifndef HEADER_BN_LCL_H
#define HEADER_BN_LCL_H

#include "bn.h"

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Cache-line width constant and its derived mask.  The mask is simply
 * WIDTH - 1, so it selects the low-order bits only while WIDTH is a power
 * of two.
 */
#define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH (64)
#define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK \
	((MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH) - 1)

/*
 * BN_window_bits_for_ctime_exponent_size(b) maps b onto a window width using
 * fixed thresholds; BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE is the largest
 * value the mapping can produce.  A width of 64 allows one more step (6) than
 * a width of 32 (5).  No definition is provided for any other width.
 * NOTE(review): b is presumably the exponent size in bits -- confirm at the
 * call sites.
 */
#if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64

#define BN_window_bits_for_ctime_exponent_size(b) \
	((b) > 937 ? 6 : \
	 ((b) > 306 ? 5 : \
	  ((b) > 89 ? 4 : \
	   ((b) > 22 ? 3 : 1))))
#define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (6)

#elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32

#define BN_window_bits_for_ctime_exponent_size(b) \
	((b) > 306 ? 5 : \
	 ((b) > 89 ? 4 : \
	  ((b) > 22 ? 3 : 1)))
#define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (5)

#endif

/*
 * Size tuning constants.
 *
 * Historical per-platform settings recorded by the original author:
 *   Pentium Pro: 16, 16, 16, 32, 64
 *   Alpha:       16, 16, 16, 16, 64
 * The Pentium Pro row equals the five values defined below, in order.
 * The "32" notes at the end of each line are kept from the original source.
 * NOTE(review): they look like an alternative setting -- confirm.
 */
#define BN_MULL_SIZE_NORMAL (16)              /* 32 */
#define BN_MUL_RECURSIVE_SIZE_NORMAL (16)     /* 32 less than */
#define BN_SQR_RECURSIVE_SIZE_NORMAL (16)     /* 32 */
#define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL (32) /* 32 */
#define BN_MONT_CTX_SET_SIZE_WORD (64)        /* 32 */

#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) &&             \
	!defined(PEDANTIC)
/*
 * BN_UMULT_HIGH section.
 *
 * No, I'm not trying to overwhelm you when stating that the
 * product of N-bit numbers is 2*N bits wide:-) No, I don't expect
 * you to be impressed when I say that if the compiler doesn't
 * support 2*N integer type, then you have to replace every N*N
 * multiplication with 4 (N/2)*(N/2) accompanied by some shifts
 * and additions which unavoidably results in severe performance
 * penalties. Of course provided that the hardware is capable of
 * producing 2*N result... That's when you normally start
 * considering assembler implementation. However! It should be
 * pointed out that some CPUs (most notably Alpha, PowerPC and
 * upcoming IA-64 family:-) provide *separate* instruction
 * calculating the upper half of the product placing the result
 * into a general purpose register. Now *if* the compiler supports
 * inline assembler, then it's not impossible to implement the
 * "bignum" routines (and have the compiler optimize 'em)
 * exhibiting "native" performance in C. That's what BN_UMULT_HIGH
 * macro is about:-)
 *
 *					<appro@fy.chalmers.se>
 */
#if defined(__alpha) &&                                                        \
	(defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
#if defined(__DECC)
#include <c_asm.h>
/*
 * DEC C flavour: the upper half of a * b via the compiler's asm() facility
 * from <c_asm.h>, issuing the "umulh" instruction.  The result is cast to
 * BN_ULONG.
 */
#define BN_UMULT_HIGH(a, b) (BN_ULONG) asm("umulh %a0,%a1,%v0", (a), (b))
#elif defined(__GNUC__)
/*
 * BN_UMULT_HIGH(a, b): upper BN_ULONG of the double-width product a * b,
 * obtained with the Alpha "umulh" instruction inside a GCC statement
 * expression.
 *
 * __asm__ is used rather than asm so the macro still compiles when GCC runs
 * in a strict ISO mode (-ansi, -std=c99, -std=c11), where plain asm is not a
 * keyword.  The temporary carries a prefixed name so that an argument
 * expression mentioning a caller variable called "ret" is not captured by
 * the macro's own local.
 */
#define BN_UMULT_HIGH(a, b) \
	({ \
		BN_ULONG bn_umh_ret_; \
		__asm__("umulh\t%1,%2,%0" \
			: "=r"(bn_umh_ret_) \
			: "r"(a), "r"(b)); \
		bn_umh_ret_; \
	})
#endif /* compiler */
#elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
#if defined(__GNUC__)
/*
 * BN_UMULT_HIGH(a, b): upper BN_ULONG of the double-width product a * b,
 * obtained with the PowerPC "mulhdu" instruction inside a GCC statement
 * expression.
 *
 * __asm__ is used rather than asm so the macro still compiles when GCC runs
 * in a strict ISO mode (-ansi, -std=c99, -std=c11), where plain asm is not a
 * keyword.  The temporary carries a prefixed name so that an argument
 * expression mentioning a caller variable called "ret" is not captured by
 * the macro's own local.
 */
#define BN_UMULT_HIGH(a, b) \
	({ \
		BN_ULONG bn_umh_ret_; \
		__asm__("mulhdu\t%0,%1,%2" \
			: "=r"(bn_umh_ret_) \
			: "r"(a), "r"(b)); \
		bn_umh_ret_; \
	})
#endif /* compiler */
#elif defined(__x86_64) && defined(SIXTY_FOUR_BIT_LONG)
#if defined(__GNUC__)
/*
 * BN_UMULT_HIGH(a, b): upper BN_ULONG of the 128-bit product a * b, using
 * the x86-64 "mulq" instruction (RDX:RAX = RAX * operand) inside a GCC
 * statement expression.  The low half is produced in RAX and discarded.
 *
 * Differences from a naive formulation, all deliberate:
 *  - __asm__ instead of asm, so the macro compiles in strict ISO modes
 *    (-ansi, -std=c99, -std=c11) where plain asm is not a keyword;
 *  - the second operand uses the "rm" constraint: mulq accepts a register
 *    or memory operand only, whereas "g" would also let the compiler pick
 *    an immediate, which cannot be assembled;
 *  - both inputs are converted to BN_ULONG so the operand is always
 *    64 bits wide, matching the q suffix of the instruction;
 *  - the temporaries carry prefixed names so an argument that mentions a
 *    caller variable named "ret" or "discard" is not captured.
 */
#define BN_UMULT_HIGH(a, b) \
	({ \
		BN_ULONG bn_umh_hi_, bn_umh_lo_; \
		__asm__("mulq\t%3" \
			: "=a"(bn_umh_lo_), "=d"(bn_umh_hi_) \
			: "a"((BN_ULONG)(a)), "rm"((BN_ULONG)(b)) \
			: "cc"); \
		(void)bn_umh_lo_; \
		bn_umh_hi_; \
	})
/*
 * BN_UMULT_LOHI(low, high, a, b): full 128-bit product of a and b using the
 * x86-64 "mulq" instruction; the low 64 bits are stored in `low` and the
 * high 64 bits in `high` (both must be 64-bit lvalues).
 *
 * The expansion is a single asm statement WITHOUT a trailing semicolon, so
 * the caller supplies it, exactly as the MSVC flavour of this macro already
 * requires.  This keeps "if (x) BN_UMULT_LOHI(...); else ..." well-formed.
 * __asm__ is used so strict ISO modes accept it, "rm" is used because mulq
 * cannot take an immediate operand, and both inputs are converted to
 * BN_ULONG so the operand width always matches the q suffix.
 */
#define BN_UMULT_LOHI(low, high, a, b) \
	__asm__("mulq\t%3" \
		: "=a"(low), "=d"(high) \
		: "a"((BN_ULONG)(a)), "rm"((BN_ULONG)(b)) \
		: "cc")
#endif
#elif (defined(_M_AMD64) || defined(_M_X64)) && defined(SIXTY_FOUR_BIT)
#if defined(_MSC_VER) && _MSC_VER >= 1400
/*
 * MSVC (_MSC_VER >= 1400) on x64: declare the two multiply intrinsics and
 * request intrinsic expansion for them with #pragma intrinsic.
 * NOTE(review): per Microsoft's documentation __umulh yields the high 64
 * bits of the 128-bit product, and _umul128 returns the low 64 bits while
 * storing the high 64 bits through its third argument -- confirm.
 */
unsigned __int64 __umulh(unsigned __int64 a, unsigned __int64 b);
unsigned __int64 _umul128(unsigned __int64 a, unsigned __int64 b,
			  unsigned __int64 *h);
#pragma intrinsic(__umulh, _umul128)
/* High word of a * b only. */
#define BN_UMULT_HIGH(a, b) __umulh((a), (b))
/*
 * `low` is assigned the intrinsic's return value; `high` is written through
 * its address, so it must be an addressable lvalue.
 */
#define BN_UMULT_LOHI(low, high, a, b) ((low) = _umul128((a), (b), &(high)))
#endif
#endif /* cpu */
#endif /* OPENSSL_NO_ASM */

/*************************************************************
 * Using the long long type
 */
/* Lw(t): t converted to BN_ULONG, then masked with BN_MASK2. */
#define Lw(t) ((BN_ULONG)(t) & BN_MASK2)
/*
 * Hw(t): t shifted right by BN_BITS2, converted to BN_ULONG, then masked
 * with BN_MASK2.
 */
#define Hw(t) ((BN_ULONG)((t) >> BN_BITS2) & BN_MASK2)

#ifdef BN_DEBUG_RAND
/*
 * bn_clear_top2max(a): zero every word of a->d from index a->top up to (but
 * not including) index a->dmax.  Words below a->top are left untouched.
 *
 * The write pointer is only formed when there is something to clear, and it
 * starts at d + top.  Starting one element earlier and pre-incrementing
 * would create a pointer before the start of the array whenever top is 0,
 * which is undefined behaviour.  A non-positive word count (dmax <= top)
 * is a no-op.
 *
 * `a` is evaluated more than once.  The expansion is a brace-enclosed block,
 * so it is used as a statement.
 */
#define bn_clear_top2max(a) \
	{ \
		int ind = (a)->dmax - (a)->top; \
		if (ind > 0) { \
			BN_ULONG *ftl = (a)->d + (a)->top; \
			for (; ind > 0; ind--) \
				*ftl++ = 0x0; \
		} \
	}
#else
/* Without BN_DEBUG_RAND the macro expands to nothing. */
#define bn_clear_top2max(a)
#endif

#ifdef BN_LLONG
/*
 * mul_add(r, a, w, c) -- BN_ULLONG flavour.
 *
 * Computes t = w * a + r + c in the double-width type, then stores the low
 * word of t in r and the high word in c.  r and c must be lvalues and are
 * each read once and written once.  With single-word operands the sum cannot
 * overflow the double-width type.
 *
 * `w` is parenthesised before the cast: without the parentheses an argument
 * such as "x + y" would expand to (BN_ULLONG)x + y * (a) and silently
 * compute the wrong product.
 */
#define mul_add(r, a, w, c) \
	{ \
		BN_ULLONG t; \
		t   = (BN_ULLONG)(w) * (a) + (r) + (c); \
		(r) = Lw(t); \
		(c) = Hw(t); \
	}

/*
 * mul(r, a, w, c) -- BN_ULLONG flavour.
 *
 * Computes t = w * a + c in the double-width type, then stores the low word
 * of t in r and the high word in c.  r and c must be lvalues.
 *
 * `w` is parenthesised before the cast: without the parentheses an argument
 * such as "x + y" would expand to (BN_ULLONG)x + y * (a) and silently
 * compute the wrong product.
 */
#define mul(r, a, w, c) \
	{ \
		BN_ULLONG t; \
		t   = (BN_ULLONG)(w) * (a) + (c); \
		(r) = Lw(t); \
		(c) = Hw(t); \
	}

/*
 * sqr(r0, r1, a) -- BN_ULLONG flavour.
 *
 * Squares a in the double-width type and stores the low word of the result
 * in r0 and the high word in r1 (both must be lvalues).
 *
 * `a` is evaluated exactly once, like the other sqr() flavours in this
 * file; writing (a) * (a) would evaluate an argument with side effects
 * (e.g. *p++) twice.
 */
#define sqr(r0, r1, a) \
	{ \
		BN_ULLONG t = (a); \
		t *= t; \
		(r0) = Lw(t); \
		(r1) = Hw(t); \
	}

#elif defined(BN_UMULT_LOHI)
/*
 * mul_add(r, a, w, c) -- BN_UMULT_LOHI flavour.
 *
 * Adds the double-width product w * a, plus the incoming carry c, to r.
 * On exit r holds the low word of the total and c the high word.  r and c
 * must be lvalues; a is captured once in a local before the multiply.
 */
#define mul_add(r, a, w, c) \
	{ \
		BN_ULONG bn_ma_mulcand = (a); \
		BN_ULONG bn_ma_sum = (r); \
		BN_ULONG bn_ma_lo, bn_ma_hi; \
		BN_UMULT_LOHI(bn_ma_lo, bn_ma_hi, w, bn_ma_mulcand); \
		/* fold in the incoming carry, noting wrap-around */ \
		bn_ma_sum += (c); \
		if (bn_ma_sum < (c)) \
			(c) = bn_ma_hi + 1; \
		else \
			(c) = bn_ma_hi; \
		/* fold in the low product word, noting wrap-around */ \
		bn_ma_sum += bn_ma_lo; \
		if (bn_ma_sum < bn_ma_lo) \
			(c) += 1; \
		(r) = bn_ma_sum; \
	}

/*
 * mul(r, a, w, c) -- BN_UMULT_LOHI flavour.
 *
 * Forms the double-width product w * a and adds the incoming carry c to it.
 * On exit r holds the low word of the total and c the high word.  r and c
 * must be lvalues; a is captured once in a local before the multiply.
 */
#define mul(r, a, w, c) \
	{ \
		BN_ULONG bn_m_mulcand = (a); \
		BN_ULONG bn_m_lo, bn_m_hi, bn_m_sum; \
		BN_UMULT_LOHI(bn_m_lo, bn_m_hi, w, bn_m_mulcand); \
		bn_m_sum = bn_m_lo + (c); \
		(c) = bn_m_hi; \
		/* the addition wrapped iff the sum fell below an addend */ \
		if (bn_m_sum < bn_m_lo) \
			(c) += 1; \
		(r) = bn_m_sum; \
	}

/*
 * sqr(r0, r1, a) -- BN_UMULT_LOHI flavour.
 *
 * Captures a once, then hands it to BN_UMULT_LOHI as both factors with r0
 * receiving the low word and r1 the high word of the square.
 */
#define sqr(r0, r1, a) \
	{ \
		BN_ULONG bn_sq_in = (a); \
		BN_UMULT_LOHI(r0, r1, bn_sq_in, bn_sq_in); \
	}

#elif defined(BN_UMULT_HIGH)
/*
 * mul_add(r, a, w, c) -- BN_UMULT_HIGH flavour.
 *
 * Adds the double-width product w * a, plus the incoming carry c, to r.
 * The high word of the product comes from BN_UMULT_HIGH and the low word
 * from an ordinary (truncating) multiply.  On exit r holds the low word of
 * the total and c the high word.  r and c must be lvalues.
 *
 * Both a and w are captured in locals so each is evaluated exactly once,
 * matching the BN_UMULT_LOHI flavour; using w directly in both the high and
 * the low multiply would evaluate an argument with side effects twice.
 */
#define mul_add(r, a, w, c) \
	{ \
		BN_ULONG high, low, ret, tmp = (a), bn_ma_w; \
		ret     = (r); \
		bn_ma_w = (w); \
		high    = BN_UMULT_HIGH(bn_ma_w, tmp); \
		ret += (c); \
		low = bn_ma_w * tmp; \
		(c) = (ret < (c)) ? 1 : 0; \
		(c) += high; \
		ret += low; \
		(c) += (ret < low) ? 1 : 0; \
		(r) = ret; \
	}

/*
 * mul(r, a, w, c) -- BN_UMULT_HIGH flavour.
 *
 * Forms the double-width product w * a (low word by an ordinary truncating
 * multiply, high word by BN_UMULT_HIGH) and adds the incoming carry c.
 * On exit r holds the low word of the total and c the high word.  r and c
 * must be lvalues.
 *
 * Both a and w are captured in locals so each is evaluated exactly once,
 * matching the BN_UMULT_LOHI flavour; using w directly in both multiplies
 * would evaluate an argument with side effects twice.
 */
#define mul(r, a, w, c) \
	{ \
		BN_ULONG high, low, ret, ta = (a), bn_m_w = (w); \
		low  = bn_m_w * ta; \
		high = BN_UMULT_HIGH(bn_m_w, ta); \
		ret  = low + (c); \
		(c)  = high; \
		(c) += (ret < low) ? 1 : 0; \
		(r) = ret; \
	}

/*
 * sqr(r0, r1, a) -- BN_UMULT_HIGH flavour.
 *
 * Captures a once; r0 receives the truncated product a * a and r1 the high
 * word of the same product as reported by BN_UMULT_HIGH.  r0 is assigned
 * before r1.
 */
#define sqr(r0, r1, a) \
	{ \
		BN_ULONG bn_sq_in = (a); \
		(r0) = bn_sq_in * bn_sq_in; \
		(r1) = BN_UMULT_HIGH(bn_sq_in, bn_sq_in); \
	}

#else
/*************************************************************
 * No long long type
 */

/*
 * Half-word helpers for a single word:
 *   LBITS(a)   - a masked with BN_MASK2l
 *   HBITS(a)   - a shifted right by BN_BITS4, then masked with BN_MASK2l
 *   L2HBITS(a) - a shifted left by BN_BITS4, then masked with BN_MASK2
 */
#define LBITS(a) ((a) & BN_MASK2l)
#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)

/*
 * The same three operations one size up, using BN_MASKl and BN_BITS2;
 * LL2HBITS converts to BN_ULLONG before shifting left.
 */
#define LLBITS(a) ((a) & BN_MASKl)
#define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
#define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)

/*
 * mul64(l, h, bl, bh): double-width multiply built from four half-word
 * products.
 *
 * On entry l and h hold the low and high halves of one operand, and bl/bh
 * the low and high halves of the other.  On exit l holds the low word and h
 * the high word of the product.  l and h must be lvalues; bl and bh are
 * each evaluated twice.
 */
#define mul64(l, h, bl, bh) \
	{ \
		BN_ULONG bn_m64_lo = (l); \
		BN_ULONG bn_m64_hi = (h); \
		BN_ULONG bn_m64_x1 = (bh) * bn_m64_lo; /* cross term 1 */ \
		BN_ULONG bn_m64_x2 = (bl) * bn_m64_hi; /* cross term 2 */ \
 \
		bn_m64_lo = (bl) * bn_m64_lo; \
		bn_m64_hi = (bh) * bn_m64_hi; \
 \
		/* sum of the cross terms; a wrap is worth one unit in the \
		 * upper half of the high word */ \
		bn_m64_x1 = (bn_m64_x1 + bn_m64_x2) & BN_MASK2; \
		if (bn_m64_x1 < bn_m64_x2) \
			bn_m64_hi += L2HBITS((BN_ULONG)1); \
 \
		/* upper half of the sum goes to the high word, lower half \
		 * is shifted up into the low word */ \
		bn_m64_hi += HBITS(bn_m64_x1); \
		bn_m64_x2 = L2HBITS(bn_m64_x1); \
		bn_m64_lo = (bn_m64_lo + bn_m64_x2) & BN_MASK2; \
		if (bn_m64_lo < bn_m64_x2) \
			bn_m64_hi++; \
 \
		(l) = bn_m64_lo; \
		(h) = bn_m64_hi; \
	}

/*
 * sqr64(lo, ho, in): double-width square of one word, built from half-word
 * products.
 *
 * `in` is read once and split with LBITS/HBITS.  The cross product of the
 * two halves is needed twice, which is folded in as a shift by one extra
 * bit: its upper bits (selected by BN_MASK2h1) are added to the high word
 * and its lower bits are shifted up into the low word.  On exit lo holds the
 * low word and ho the high word of the square; both must be lvalues.
 */
#define sqr64(lo, ho, in) \
	{ \
		BN_ULONG bn_s64_in = (in); \
		BN_ULONG bn_s64_lo = LBITS(bn_s64_in); \
		BN_ULONG bn_s64_hi = HBITS(bn_s64_in); \
		BN_ULONG bn_s64_x = bn_s64_lo * bn_s64_hi; \
 \
		bn_s64_lo = bn_s64_lo * bn_s64_lo; \
		bn_s64_hi = bn_s64_hi * bn_s64_hi; \
		bn_s64_hi += (bn_s64_x & BN_MASK2h1) >> (BN_BITS4 - 1); \
		bn_s64_x = (bn_s64_x & BN_MASK2l) << (BN_BITS4 + 1); \
		bn_s64_lo = (bn_s64_lo + bn_s64_x) & BN_MASK2; \
		if (bn_s64_lo < bn_s64_x) \
			bn_s64_hi++; \
 \
		(lo) = bn_s64_lo; \
		(ho) = bn_s64_hi; \
	}

/*
 * mul_add(r, a, bl, bh, c) -- flavour without a double-width integer type.
 *
 * a is split into halves and multiplied by the operand whose halves are
 * bl (low) and bh (high) using mul64.  The incoming carry c and then the
 * old value of r are added to the low word, with each wrap-around bumping
 * the high word.  On exit r holds the low word of the total and c the high
 * word.  r and c must be lvalues; c doubles as scratch storage for the old
 * value of r part-way through.
 */
#define mul_add(r, a, bl, bh, c) \
	{ \
		BN_ULONG bn_ma_in = (a); \
		BN_ULONG bn_ma_lo = LBITS(bn_ma_in); \
		BN_ULONG bn_ma_hi = HBITS(bn_ma_in); \
 \
		mul64(bn_ma_lo, bn_ma_hi, (bl), (bh)); \
 \
		/* add the incoming carry */ \
		bn_ma_lo = (bn_ma_lo + (c)) & BN_MASK2; \
		if (bn_ma_lo < (c)) \
			bn_ma_hi++; \
		/* add the previous contents of r, staged through c */ \
		(c) = (r); \
		bn_ma_lo = (bn_ma_lo + (c)) & BN_MASK2; \
		if (bn_ma_lo < (c)) \
			bn_ma_hi++; \
		(c) = bn_ma_hi & BN_MASK2; \
		(r) = bn_ma_lo; \
	}

/*
 * mul(r, a, bl, bh, c) -- flavour without a double-width integer type.
 *
 * a is split into halves and multiplied by the operand whose halves are
 * bl (low) and bh (high) using mul64, then the incoming carry c is added to
 * the low word; a wrap-around bumps the high word.  On exit r holds the low
 * word of the total and c the high word, both masked with BN_MASK2.  r and
 * c must be lvalues.
 */
#define mul(r, a, bl, bh, c) \
	{ \
		BN_ULONG bn_m_in = (a); \
		BN_ULONG bn_m_lo = LBITS(bn_m_in); \
		BN_ULONG bn_m_hi = HBITS(bn_m_in); \
 \
		mul64(bn_m_lo, bn_m_hi, (bl), (bh)); \
 \
		/* add the incoming carry; masking happens on use */ \
		bn_m_lo = bn_m_lo + (c); \
		if ((bn_m_lo & BN_MASK2) < (c)) \
			bn_m_hi++; \
		(c) = bn_m_hi & BN_MASK2; \
		(r) = bn_m_lo & BN_MASK2; \
	}
#endif /* !BN_LLONG */

/*
 * Internal word-array routines.  Only the prototypes are visible in this
 * header; the definitions live in other translation units.
 * NOTE(review): the grouping remarks below are inferred from the function
 * and parameter names alone -- confirm against the implementations before
 * relying on them.
 */

/* Multiplication: general form plus variants with "comba8"/"comba4" names. */
void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb);
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
/* Squaring counterparts; the input array is const-qualified. */
void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp);
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a);
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a);
/* Comparisons over word arrays; both inputs are const-qualified. */
int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n);
int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl);
/* Recursive multiply/square variants; each takes a caller-supplied array t. */
void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2, int dna,
		      int dnb, BN_ULONG *t);
void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
			   int tna, int tnb, BN_ULONG *t);
void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2, BN_ULONG *t);
/* "low" and "high" multiply variants. */
void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n);
void bn_mul_low_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
			  BN_ULONG *t);
void bn_mul_high(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, BN_ULONG *l, int n2,
		 BN_ULONG *t);
/* Add/subtract over word arrays; both return a BN_ULONG. */
BN_ULONG bn_add_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
			   int cl, int dl);
BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
			   int cl, int dl);
/* Returns int; the meaning of the return value is not visible here. */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
		const BN_ULONG *np, const BN_ULONG *n0, int num);

#ifdef __cplusplus
}
#endif

#endif
